library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✔ tibble 3.1.7 ✔ purrr 0.3.4
## ✔ tidyr 1.2.0 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.4.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(knitr)
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
load("~/ShinyApps/defense2022/SBIR_Project/SBIR_Project_data.rdata")
branch <- group_by(SBIR_STTR, Branch, `Award Year`) %>%
summarise(total=sum(`Award Amount`),C = length(Branch),avg = total/C) %>%
mutate(year = as.character(`Award Year`))
## `summarise()` has grouped output by 'Branch'. You can override using the
## `.groups` argument.
ggplotly (ggplot( data = branch,
aes( x = reorder(Branch,-C),
y = C,fill=year)) +
geom_bar( stat = 'identity') +
labs(title = '各部門補助案數量',
x = '部門',
y = '數量') +
theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1)) )
因為Air Force、Army、Navy最多,總補助金額也最高
ggplotly(branch%>%
ggplot(aes(x = reorder(Branch,-total), y=total/10000, fill=year)) +
geom_col() +
labs(title = '各部門總補助金額',
x = '部門',
y = '總補助金額(萬美金)')+theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1)))
因為各個部門的data筆數差蠻多的,所以看平均一個補助案的補助金額。
2020和2021加總:平均補助金額變成 Defense threat reduction 最高
2020年:平均補助金額是Strategic Capabilities Office最高
ggplotly(branch %>%
ggplot(aes(x = reorder(Branch,-total/C), y=total/10000/C, fill=year)) +
geom_col() +
labs(title = '各部門平均補助金額',
x = '部門',
y = '平均補助金額(萬美金)')+theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1)) )
df1$kw = str_replace_all(
df1$kw,"AI","artificial intelligence")
df1$kw = str_replace_all(
df1$kw,"Artificial Intelligence(AI)", "artificial intelligence")
df1$kw = str_replace_all(
df1$kw,"Artificial Intelligence(artificial intelligence)","artificial intelligence")
df1$kw = str_replace_all(df1$kw,"Autonomy","Automation")
df1$kw = str_replace_all(
df1$kw,"AUTONOMOUS","Automation")
df1$kw <- gsub(" ", "_", df1$kw) #關鍵字用底線代替空格
df2 <- df1 %>% group_by(kw) %>%
summarise(count_kw = length(kw)) %>%arrange(desc(count_kw))
df2 <- na.omit(df2) %>% head(20)
ggplotly(ggplot(df2,aes(x = reorder(kw,count_kw), y = count_kw))+
geom_bar(stat = "identity")+
labs(title = '關鍵字(標籤)出現頻率',
x = NULL,
y = '頻率')+
coord_flip()+
geom_text(mapping = aes(label = count_kw),
size = 3, colour = 'black', vjust = 0.5))
出現最多的是artificial intelligence、machine learning
除了有AR、VR之外,相關的關鍵字還有mixed reality、training、simulation、modeling、cybersecurity、blockchain、IoT、devsecops(有關資訊安全的應用程式)、Kubernetes(用於自動部署、擴展和管理容器化應用程序的公開系統),主要是和航空、訓練、成像、模擬相關,也有和軟體開發、資訊安全相關。
和infrared(紅外線的)相關的關鍵字,包含remote sensing、optics(光學的)、 MWIR(Middle Wavelength Infrared)、LWIR(Long Wavelength Infrared)、Quantum cascade lasers(量子級聯雷射器)、PNT(Position Navigation and Timing),主要是和光學、雷射所應用於無人機(可應用MWIR和LWIR)或導航相關。
machine learning和artificial intelligence是資料中出現最多次的關鍵字,有一同出現的關鍵字範圍廣,因此也聚集在關聯式文字雲的中間。 而透過tsne的關聯式文字雲相關的關鍵字有neural networks(神經網絡)、additive manufacturing(積層製造)、algorithm(演算法)、natural language processing、Sensor fusion(傳感器融合) 。主要是和數據、無人機或自駕車、太空、衛星的應用相關。
1.取得原始文集
2.使用AutoPhrase產生初始斷詞字典
3.使用Python,將文集利用斷詞字典把底線相連,利用spacy進行斷詞,最後整理斷詞後的資料匯出csv。
4.使用R,利用udpipe提供的function,進行ngram計算與TF-IDF篩選,挑選一定詞頻的字詞,更新斷詞字典。
5.重複3~4
6.最後再根據TF-IDF等指標篩選出關鍵字字典。
#library(readr)
#dict3 <- read_csv("dict3.csv")
#SBIRtoken_des1 = read.csv('./des_token_r3.csv',stringsAsFactors = F)
#lemma_des <- read.csv('./lemma_des.csv',stringsAsFactors = F)
phrase_freq = lemma_des %>%
filter((token%in%dict3$replacement)) %>%
group_by(token) %>% count() %>%
arrange(desc(n))
ggplotly(
lemma_des %>%
filter((token%in%dict3$replacement)) %>%
group_by(token) %>% count() %>%
arrange(desc(n)) %>%
head(15) %>%
ggplot(aes(x = reorder(token, n), y = n)) +
geom_bar(stat = "identity") +
coord_flip()+
geom_text(mapping = aes(label = n),
size = 3, colour = 'black', vjust = 0.5)+
labs(x = NULL, y = '頻率', title = '斷詞字典詞頻')
)
出現最多的是air force,應該是跟屬於空軍的補助案資料數量有關係
TF:Term Frequency
IDF:Inverse Document Frequency
計算出誰是相對比較重要的字詞
→字詞的重要性隨著在文本出現的頻率越高則越高;在不同文本檔案間出現的次數越高則反而降低。
#去除無效的摘要token
lemma_des$token = str_replace_all(
lemma_des$token,"xxxxx"," ")
lemma_des$token = str_replace_all(
lemma_des$token,"qqq"," ")
lemma_des$token = str_replace_all(
lemma_des$token,"xxx"," ")
lemma_des$token = str_replace_all(
lemma_des$token,"XX"," ")
lemma_des$token = str_replace_all(
lemma_des$token,"BLANK"," ")
lemma_des$token = str_replace_all(
lemma_des$token,"redact"," ")
library(udpipe,lattice)
Abstract_tf_idf1 <- document_term_frequencies(lemma_des[, c("doc_id", "token")])
Abstract_tf_idf1 <- document_term_frequencies_statistics(Abstract_tf_idf1)
stopwords <- stopwords::stopwords("en", source = "smart")
Abstract_tf_idf1 <- Abstract_tf_idf1 %>%
filter(nchar(term) >1) %>%
filter(!(tolower(term) %in% stopwords))
# 每個字平均tf-idf
word_tfidf <-Abstract_tf_idf1%>%
group_by(term) %>%
summarise(tf_idf = mean(tf_idf),freq = sum(freq))
ggplotly(
word_tfidf %>%
arrange(desc(tf_idf)) %>%
head(10) %>%
ggplot(aes(freq,reorder(term,freq))) +
geom_bar(stat = "identity") +
geom_text(mapping = aes(label = freq),
size = 3, colour = 'black', vjust = 0.5)+
labs(x = "頻率", y = NULL, title = 'SBIR/STTR高TF-IDF字的詞頻')
)
# 篩選詞頻大於1
word_tfidf1 <-Abstract_tf_idf1%>%
group_by(term) %>%
summarise(tf_idf = mean(tf_idf), freq = mean(freq)) %>% filter(freq >1)
ggplotly(
word_tfidf1 %>%
arrange(desc(tf_idf)) %>%
head(10) %>%
ggplot(aes(freq,reorder(term,freq))) +
geom_bar(stat = "identity") +
geom_text(mapping = aes(label = freq),
size = 3, colour = 'black', vjust = 0.5)+
labs(x = "頻率", y = NULL, title = 'SBIR/STTR高TF-IDF字的詞頻(篩選詞頻大於1)')
)
SWBLI : separation length of shock wave/boundary layer interaction 衝擊波邊界層相互作用的長度
在摘要中和battery相關的字有li ion(鋰離子)、energy storage、electrode(電極),補助案有和能源儲存、太空、飛機相關。
在摘要中和satellie相關的字有GEO(geosynchronous equatorial orbit地球同步轉移軌道)、LEO(low-Earth orbit低地球軌道)、orbit(軌道)、spacecraft、cislunar(月軌道)。
摘要字詞同一個分群內也有composite(複合材料)、3D列印、積層製造、CMC(陶瓷基複合材料)、combustion(燃燒)、propellant(推進器)相關。 -CMC(Ceramic matrix composite):材料有在高溫下的耐腐蝕和耐磨性,因此適合太空相關的應用
在摘要中和radio相關的字有waveform、SDR(Software Defined Radio)、LPD(low probability of detection)、coherent。
同一個分群內也有Virtual reality、learning、immersive(沈浸式)、simulator。
另一邊同一個分群內也有quantum、photonic(光子)、silicon(矽)、chip。
在摘要中和antenna(天線)相關的字有phased array(相位陣列:由一群天線組成的陣列)、GHz、reciver、analog(模擬)、interference。
同一個分群內也有radar、munition(軍備品)、lethality(致命性)、projectile(彈藥)、wound(傷口)、infection。
A <- SBIR_STTR
A$id = 1:nrow(A)
d3 <- A %>% filter(grepl(".*Photomultiplier|.*PMT|.*night vision goggle",`Research Keywords`)) %>%
full_join( A %>% filter(grepl(".*Photomultiplier|.*PMT|.*night vision goggle", Abstract)))
## Joining, by = c("Company", "Award Title", "Agency", "Branch", "Phase",
## "Program", "Agency Tracking Number", "Contract", "Proposal Award Date",
## "Contract End Date", "Solicitation Number", "Solicitation Year", "Topic
## Code", "Award Year", "Award Amount", "DUNS", "Hubzone Owned", "Socially and
## Economically Disadvantaged", "Woman Owned", "Number Employees", "Company
## Website", "Address1", "Address2", "City", "State", "Zip", "Contact Name",
## "Contact Title", "Contact Phone", "Contact Email", "PI Name", "PI Title",
## "PI Phone", "PI Email", "RI Name", "RI POC Name", "RI POC Phone", "Research
## Keywords", "Abstract", "id")
d3 <- d3 %>% filter(id!=1292 &id!=3782 &id!=1974 &id!=2198& id!=2205& id!= 4292)
d3_PMT <- d3 %>% summarise(id = 1:nrow(d3), kw = `Research Keywords`,Abstract)
d3_b <- d3 %>% group_by(Branch) %>% summarise(c = length(Branch)) %>% na.omit() #有一筆部門是na
ggplotly(
ggplot(d3_b)+
geom_col(aes(x = reorder(Branch,c), y = c ))+
geom_text(aes(x = Branch, y = c ,label = c),size = 3, colour = 'black', vjust = 0.5)+
labs(x = "補助案所屬部門", y = "資料筆數")+
coord_flip() )
d3_PMT$kw <- str_split(d3_PMT$kw,",")
l3<- unnest(d3_PMT, kw)
l3.table <- table(l3$kw) %>% as.data.frame() %>% arrange(desc(Freq))
#59個關鍵字
l3.table20 <- l3.table %>% head(20)
#畫前20大相關連的kw
ggplotly(
ggplot(l3.table20)+
geom_col(aes(x = reorder(Var1,Freq), y = Freq ))+
geom_text(aes(x = Var1, y = Freq ,label = Freq),size = 3, colour = 'black', vjust = 0.5)+
labs(x = "相關的標籤", y = "出現次數")+
coord_flip() )
相關的標籤有augmented reality(AR),head mounted display(HMD,頭戴式裝置),Night vision,Light Security,ambient filter(環境過濾器)
→都是和夜視鏡比較有關
- CLYC :Cs2LiYCl6:Ce 新型伽馬中子復合閃爍晶體
- CLLBC:Cesium Lanthanum Lithium BromoChloride
- CASEVAC(Casualty evacuation)
- Light Secure Special Warfare Display(LSSWD)
補提到光放管的補助案只有兩筆:
1.用矽為基礎的光電倍增管芯片,改善矽光電倍增管silicon photomultipliers (SiPMs),以實現大面積光電倍增器芯片,並使它有輻射耐受性。
2.開發混合鈣鈦礦閃爍體(mixed elpasolite scintillators),結合SiPM (矽光電倍增管)和 PMT提高能量分辨率
其他補助案內容大多都不是以光放管為主:
有兩則提到用主題為AR相關(只是敘述提到夜視鏡而已)、兩則提到其攜帶式電源可供應各種設備(如夜視鏡)、或是新型材料改善傳統的紅外光學器件體積龐大
#AR/VR #有106筆
d4 <- A %>% filter(grepl(".*augmented reality|.*virtual reality", `Research Keywords`))%>%
full_join( A %>% filter(grepl(".*augmented reality|.*virtual reality", Abstract)))
## Joining, by = c("Company", "Award Title", "Agency", "Branch", "Phase",
## "Program", "Agency Tracking Number", "Contract", "Proposal Award Date",
## "Contract End Date", "Solicitation Number", "Solicitation Year", "Topic
## Code", "Award Year", "Award Amount", "DUNS", "Hubzone Owned", "Socially and
## Economically Disadvantaged", "Woman Owned", "Number Employees", "Company
## Website", "Address1", "Address2", "City", "State", "Zip", "Contact Name",
## "Contact Title", "Contact Phone", "Contact Email", "PI Name", "PI Title",
## "PI Phone", "PI Email", "RI Name", "RI POC Name", "RI POC Phone", "Research
## Keywords", "Abstract", "id")
d4_ARVR <- d4 %>% summarise(id = 1:nrow(d4), kw = `Research Keywords`,Abstract)
d4_b <- d4 %>% group_by(Branch) %>% summarise(c = length(Branch))
ggplotly(
ggplot(d4_b)+
geom_col(aes(x = reorder(Branch,c), y = c ))+
geom_text(aes(x = Branch, y = c ,label = c),size = 3, colour = 'black', vjust = 0.5)+
labs(x = "補助案所屬部門", y = "資料筆數")+
coord_flip() )
d4_ARVR$kw <- str_split(d4_ARVR$kw,",")
l4<- unnest(d4_ARVR, kw)
l4.table <- table(l4$kw) %>% as.data.frame() %>% arrange(desc(Freq))
#470個關鍵字
l4.table30 <- l4.table %>% head(30)
#畫前30大相關連的kw
ggplotly(
ggplot(l4.table30)+
geom_col(aes(x = reorder(Var1,Freq), y = Freq ))+
geom_text(aes(x = Var1, y = Freq ,label = Freq),size = 3, colour = 'black', vjust = 0.5)+
labs(x = "相關的標籤", y = "出現次數")+
coord_flip() )
相關的標籤有machine learning,artificial intelligence, SAR(Source Approval Request),lidar(光學雷達),Radar
- UAS(Unmanned Aircraft System 無人機系統)
- ISR(Intelligence, Surveillance, and Reconnaissance):是一種綜合情報和作戰職能,可以定義為協調獲取、處理和提供準確、相關、及時的信息和情報,以支持指揮官的決策過程。
- DLA(Defense Logistics Agency國防後勤局)
補助案大多是和空軍的飛行訓練有關,或是飛行相關應用的延伸(如加強導彈防禦任務、衛星協作工具SCTK),也有部分是醫療模擬培訓系統、機器人(結合AI或3D模型)相關主題,少部分有提到用於海軍提升海軍系統的水下環境感知、或是以頭戴式裝置改善暈眩或是PTSD。
#target drone #只有一筆
d5 <- A %>% filter(grepl(".*target drone", `Research Keywords`))%>%
full_join( A %>% filter(grepl(".*target drone", Abstract)))
## Joining, by = c("Company", "Award Title", "Agency", "Branch", "Phase",
## "Program", "Agency Tracking Number", "Contract", "Proposal Award Date",
## "Contract End Date", "Solicitation Number", "Solicitation Year", "Topic
## Code", "Award Year", "Award Amount", "DUNS", "Hubzone Owned", "Socially and
## Economically Disadvantaged", "Woman Owned", "Number Employees", "Company
## Website", "Address1", "Address2", "City", "State", "Zip", "Contact Name",
## "Contact Title", "Contact Phone", "Contact Email", "PI Name", "PI Title",
## "PI Phone", "PI Email", "RI Name", "RI POC Name", "RI POC Phone", "Research
## Keywords", "Abstract", "id")
#包含drone #228筆
d5_1 <- A %>% filter(grepl(".*target drone|.*drone", `Research Keywords`))%>%
full_join( A %>% filter(grepl(".*target drone|.*dron", Abstract)))
## Joining, by = c("Company", "Award Title", "Agency", "Branch", "Phase",
## "Program", "Agency Tracking Number", "Contract", "Proposal Award Date",
## "Contract End Date", "Solicitation Number", "Solicitation Year", "Topic
## Code", "Award Year", "Award Amount", "DUNS", "Hubzone Owned", "Socially and
## Economically Disadvantaged", "Woman Owned", "Number Employees", "Company
## Website", "Address1", "Address2", "City", "State", "Zip", "Contact Name",
## "Contact Title", "Contact Phone", "Contact Email", "PI Name", "PI Title",
## "PI Phone", "PI Email", "RI Name", "RI POC Name", "RI POC Phone", "Research
## Keywords", "Abstract", "id")
補助案的內容是ASC利用LiDAR(global shutter 3D flash LiDAR)為傳感器,用於測量距離、變化、方位等,並輸出3D點雲。再利用非光束轉向(non-mechanical beam steering)、帶有3D點雲的2D高分辨率影像以生成高分辨率(2D high resolution video with the 3D point cloud )的3D影像來幫助評估導彈。
ASC是Continental Motors軍用LiDAR產品的獨家授權經銷商,希望夠過此方法降低成本將其技術安裝在靶機上。
Global Shutter 3D Flash LiDAR,3D Point Cloud,real time 3D video,high resolution 3D video,Dynamic Range,Field of view,Laser,Focal plane array
提到drone的補助案內容,皆和靶機沒有關係,提到的內容都是其他無人機運用(如:運送、監控、蒐集不同場域的資料);或是無人機系統或技術上的改善。
#gyroscop #只有24筆
d6 <- A %>% filter(grepl(".*gyroscope|.*Gyroscope", Abstract, ))%>%
full_join( A %>% filter(grepl(".*gyroscope|.*Gyroscope",`Research Keywords`)))
## Joining, by = c("Company", "Award Title", "Agency", "Branch", "Phase",
## "Program", "Agency Tracking Number", "Contract", "Proposal Award Date",
## "Contract End Date", "Solicitation Number", "Solicitation Year", "Topic
## Code", "Award Year", "Award Amount", "DUNS", "Hubzone Owned", "Socially and
## Economically Disadvantaged", "Woman Owned", "Number Employees", "Company
## Website", "Address1", "Address2", "City", "State", "Zip", "Contact Name",
## "Contact Title", "Contact Phone", "Contact Email", "PI Name", "PI Title",
## "PI Phone", "PI Email", "RI Name", "RI POC Name", "RI POC Phone", "Research
## Keywords", "Abstract", "id")
d6_gyroscop <- d6 %>% summarise(id = 1:nrow(d6), kw = `Research Keywords`,Abstract)
d6_b <- d6 %>% group_by(Branch) %>% summarise(c = length(Branch))
ggplotly(
ggplot(d6_b)+
geom_col(aes(x = reorder(Branch,c), y = c ))+
geom_text(aes(x = Branch, y = c ,label = c),size = 3, colour = 'black', vjust = 0.5)+
labs(x = "補助案所屬部門", y = "資料筆數")+
coord_flip() )
d6_gyroscop$kw <- str_split(d6_gyroscop$kw,",")
l6<- unnest(d6_gyroscop, kw)
l6.table <- table(l6$kw) %>% as.data.frame() %>% arrange(desc(Freq))
#107個關鍵字
l6.table20 <- l6.table %>% head(20)
#畫前50大相關連的kw
ggplotly(
ggplot(l6.table20)+
geom_col(aes(x = reorder(Var1,Freq), y = Freq ))+
geom_text(aes(x = Var1, y = Freq ,label = Freq),size = 3, colour = 'black', vjust = 0.5)+
labs(x = "相關的標籤", y = "出現次數")+
coord_flip() )
相關的標籤有Accelerometer/Accelerometry(加速儀/加速器),Navigation,Anomalous Dispersion(色散異常),Inertial measurement unit(IMU,慣性測量單位),Superluminal Laser(超光速雷射),Rubidium Atoms(銣)
- IMU 慣性測量單位(Inertial measurement unit,簡稱IMU):是測量物體三軸姿態角(或角速率)以及加速度的裝置。
- MEMS 微機電系統(Microelectromechanical Systems)
- GNC (Guidance,Navigation& Control)
- AFSOC(Air Force Special Operations Command )
補助案大多是和導航有關,像是在特殊環境、太空上、海下的導航;其他也有提到相關的陀螺儀技術(如:熔融石英水盆諧振器陀螺儀 BRG、MEMS 陀螺儀)